Modelling eel abundance in the Vilaine bassin

0. Setup

Load dependencies.

library(renv)
renv::load()
renv::restore() # Install missing librairies.
renv::status() # Check the project state.
library(zen4R)
library(here)

Download data from Zenodo. The data file is relatively heavy (~800Mb), this operation can take a few minutes.

doi <- "10.5281/zenodo.17962542"
dest_dir <- here("data")
if (!dir.exists(dest_dir)) dir.create(dest_dir, recursive = TRUE)
download_zenodo(doi, path = dest_dir)
unzip(here(dest_dir, "miste_data.zip"), exdir = dest_dir)

If everything went well, the data should have been downloaded in data/zenodo/.

1. River data

First, we focus on the river geographic data. Let’s load it.

base::load(here(dest_dir, "zenodo", "processed_data", "reseaux_hydrographiques.rda"))

For now, we will focus on the Vilaine bassin. We can view it using sfnetworks.

library(sf)
library(sfnetworks)
net <- as_sfnetwork(rht_vilaine)
plot(net, cex = 0.5)

Caution

We can see that the hydrographic network is not entirely connected. For example, a small portion in the south west is disconnected from the rest of the bassin. We should keep this in mind for later, as it could bias our statistical model.

We want to check that the network data is valid. In particular, we want to verify that there is no duplicate in the nodes, or edges data.

nodes <- net |>
    activate("nodes") |>
    st_as_sf()
sum(duplicated(nodes))
[1] 0
library(dplyr)
library(tidyr)

edges <- net |>
    activate("edges") |>
    st_as_sf()
sum(duplicated(select(edges, c("from", "to"))))
[1] 0

We see that there is no duplication for the Vilaine network. We can proceed to the next step.

2. Environmental data

2.1 Load data

base::load(here(dest_dir, "zenodo", "processed_data", "poissons_env_temp.RData"))

env
# A tibble: 5,777 × 14
   sta_id pop_id ope_id ope_date            annee distance_mer altitude
    <int>  <int>  <int> <dttm>              <dbl>        <int>    <int>
 1   9655  37720  22622 1996-09-24 10:09:00  1996           NA      305
 2   9225  35420  16755 1998-06-19 09:30:00  1998           NA       80
 3   9622  37544  16012 2000-05-22 14:45:00  2000          328       98
 4   9762  38357  17091 2006-09-28 14:00:00  2006          216       31
 5   9384  36157  17349 2006-06-21 14:30:00  2006          482      140
 6   9712  38066  17621 2016-09-08 10:30:00  2016          400      130
 7   9454  36580  17379 2010-09-14 15:00:00  2010          500      124
 8   9520  36941  17079 2007-06-14 14:30:00  2007          224       37
 9   9381  36135  17154 2006-06-20 00:00:00  2006          387      100
10   9284  35724  17596 2016-09-29 10:00:00  2016          212       32
# ℹ 5,767 more rows
# ℹ 7 more variables: surface_bv <dbl>, distance_source <dbl>, largeur <dbl>,
#   pente <dbl>, profondeur <dbl>, temp_juillet <dbl>, temp_janvier <dbl>

Let’s rename column for clarity.

env <- env |> rename(year = annee)

Station geographical coordinates are stored in points_geo_<river>. We visualize them over the hydrographic network.

plot(net, col = "black", cex = 0.5)
plot(points_geo_vilaine, col = "red", add = TRUE, pch = 16, cex = 0.5)

2.2. Visualisation of altitude data

To begin with, let’s focus on a single environment variable: the altitude.

d_env <- env |>
    select(c("sta_id", "pop_id", "altitude")) |>
    distinct()
d_env |> filter(is.na(altitude))
# A tibble: 2 × 3
  sta_id pop_id altitude
   <int>  <int>    <int>
1   8853  33379       NA
2  13994  51300       NA

We see that we have missing values for two stations: 8853 and 13994. Next, we want to relate altitude values to geographical coordinates.

pts_vilaine <- points_geo_vilaine |>
    select(c("pop_id", "geometry")) |>
    left_join(d_env, by = "pop_id") |>
    filter(!is.na(altitude))
pts_vilaine
Simple feature collection with 37 features and 3 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: -2.965245 ymin: 47.46923 xmax: -1.051178 ymax: 48.3493
Geodetic CRS:  WGS 84
First 10 features:
   pop_id sta_id altitude                   geometry
1   46885  12253       25 POINT (-2.429369 47.71654)
2   46166  11854      176 POINT (-2.965245 48.32017)
3   47601  12434       11 POINT (-1.796815 47.46923)
4   46671  12118        9 POINT (-2.332695 47.76529)
5   47659  12447        3  POINT (-2.142875 47.5878)
6   47613  12436        5 POINT (-1.867647 47.70129)
7   46484  12007       41  POINT (-2.35679 48.01553)
8   47460  12407       48 POINT (-1.410521 47.71718)
9   47677  12451       16 POINT (-2.563233 47.59471)
10  47326  12380       30 POINT (-1.562385 48.03841)

We see that the altitude is not measured on every sampling points, but only in 37 locations. Let’s plot them.

library(ggplot2)
set_theme(theme_minimal())
options(
    ggplot2.continuous.colour = "viridis",
    ggplot2.continuous.fill = "viridis"
)

ggplot() +
    geom_sf(data = points_geo_vilaine, color = "grey80", alpha = 0.5) +
    geom_sf(data = pts_vilaine, aes(color = altitude), size = 5) +
    labs(
        color = "Altitude (m)"
    )

We see in grey every location were we have sample of fish abundances, and in colour where we have altitude data. We have to interpolate altitude data for missing locations. The simplest way to do it is to the “nearest neighbour” interpolation.

2.3. Interpolation of altitude data

We first build proximity polygons with the terra::voronoi function.

library(terra)

d_elev <- select(pts_vilaine, c("geometry", "altitude"))
d_elev.vect <- terra::vect(d_elev)
v <- voronoi(d_elev.vect)
plot(v, "altitude")
points(d_elev.vect)

v.sf <- st_as_sf(v)
pred <- st_intersection(v.sf, points_geo_vilaine)
d_elev.nn <- select(pred, c("altitude", "pop_id", "geometry"))

ggplot() +
    geom_sf(data = pred, aes(color = altitude)) +
    geom_sf(data = pts_vilaine, aes(color = altitude), size = 5) +
    labs(
        color = "Altitude (m)"
    )

Predicted altitudes seem consistent with the nearest neighbour interpolation.

2.4. Cross-validation of altitude interpolation

Now, that we have a prediction pipeline working we will estimate its performance using cross-validation. To do so, we will do a LOOCV (Leave-One-Out Cross-Validation).

n <- nrow(d_elev)
preds <- numeric(n) # Vector filled with 0s.
for (i in 1:n) {
    train <- d_elev[-i, ]
    test <- d_elev[i, ]
    v_train <- voronoi(terra::vect(train))
    preds[i] <- st_intersection(st_as_sf(v_train), test)$altitude
}
d_elev$altitude.loo <- preds

ggplot(d_elev, aes(x = altitude, y = altitude.loo)) +
    geom_point() +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    labs(
        x = "Observed altitude (m)",
        y = "Predicted altitude (m)"
    )

rmse <- sqrt(mean((d_elev$altitude - d_elev$altitude.loo)^2))
rmse
[1] 18.74797

3. Abundance data

3.1. Load data

catches <- catches |>
    rename(
        species = esp_code_alternatif,
        year = annee,
        abundance = effectif
    )
catches
# A tibble: 69,823 × 6
   sta_id pop_id ope_id  year species abundance
    <int>  <int>  <int> <dbl> <chr>       <int>
 1   8530  31870  19943  2017 TRF           159
 2   8530  31870  19943  2017 VAI            82
 3   8530  31870  19944  2011 TRF           217
 4   8530  31870  19944  2011 VAI             7
 5   8530  31870  19945  2009 TRF           232
 6   8530  31870  19945  2009 VAI             9
 7   8530  31870  19946  2007 TRF           213
 8   8530  31870  19946  2007 VAI            36
 9   8530  31870  83029  2019 TRF           126
10   8530  31870  83029  2019 VAI           104
# ℹ 69,813 more rows
species_names <- unique(catches$species)

Next, let’s add zeros where species are not found.

catches <- catches |>
    pivot_wider(
        names_from = species,
        values_from = abundance,
        values_fill = list(abundance = 0)
    ) |>
    pivot_longer(
        cols = species_names,
        names_to = "species",
        values_to = "abundance"
    )

catches
# A tibble: 450,606 × 6
   sta_id pop_id ope_id  year species abundance
    <int>  <int>  <int> <dbl> <chr>       <int>
 1   8530  31870  19943  2017 TRF           159
 2   8530  31870  19943  2017 VAI            82
 3   8530  31870  19943  2017 LOF             0
 4   8530  31870  19943  2017 CHA             0
 5   8530  31870  19943  2017 SDF             0
 6   8530  31870  19943  2017 PFL             0
 7   8530  31870  19943  2017 BAF             0
 8   8530  31870  19943  2017 CHE             0
 9   8530  31870  19943  2017 GOU             0
10   8530  31870  19943  2017 OBR             0
# ℹ 450,596 more rows

3.2. Preliminary plots

Let’s plot abundance histogram for each species. As most species abundance are dominated by zeros, because of absence data, we plot log(abundance + 1) instead of raw abundances. This also implies that we will need to use zero-inflated distributions in our statistical model to account for this absence data.

catches |>
    ggplot(aes(x = abundance + 1)) + # add 1 to avoid log(0)
    geom_histogram(binwidth = 0.5) +
    scale_x_log10() +
    facet_wrap(~species)

Let’s plot abundance time series for the 10 most abundant species.

n_species <- 10
common_species <- catches |>
    group_by(species) |>
    summarise(mean_abundance = mean(abundance)) |>
    arrange(desc(mean_abundance)) |>
    slice(1:n_species) |>
    pull(species)

catches |>
    filter(species %in% common_species) |>
    group_by(year, species) |>
    summarise(mean_abundance = mean(abundance)) |>
    ggplot(aes(x = year, y = mean_abundance, colour = species)) +
    geom_line()

There is no clear trend, but we can’t say much at this point because trend may be masked by sampling efforts and environmental covariates.

Note

Here it would be interesting to plot the number of operations per year to see the evolution of the sampling effort.

d_eel <- catches |> filter(species == "ANG")
d_eel.avg <- d_eel |>
    group_by(pop_id) |>
    summarise(abundance_mean = mean(abundance))
data <- inner_join(d_elev.nn, d_eel.avg, by = "pop_id")

ggplot(data, aes(x = altitude, y = abundance_mean)) +
    geom_point() +
    labs(
        x = "Altitude (m)",
        y = "Eel abundance"
    )

We clearly observe the expected trend of eel abundance with altitude.

Next, we want to model the effect of altitude, while accounting for time using INLA.

4. Statistical models

Data is ready to model eel abundances in space, time and with altitude. In this section, we will build increasingly complex models.

Before doing anything fancy, let’s load INLA, and scale our predictor variable (altitude) as it is good practice.

library(INLA)

data <- inner_join(d_eel, d_elev.nn, by = "pop_id")
data$altitude_s <- scale(data$altitude)

4.1. Abundance distribution and overdisperion

Here, we focus on the choice of the distribution of our target variable: eel abundance. We will focus particularly on overdispersion, as we know that eel distribution is zero-inflated due to absence data. For the following model, we add a random effect for stations and year. We will discuss in further details these points in following sections.

4.1.1. Poisson model

We begin with a poisson GLM.

m.poisson <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "iid") +
        f(pop_id, model = "iid"),
    family = "poisson",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE, dic = TRUE)
)
summary(m.poisson)
Time used:
    Pre = 2.24, Running = 0.6, Post = 0.146, Total = 2.99 
Fixed effects:
              mean    sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept)  1.921 0.165      1.594    1.922      2.244  1.922   0
altitude_s  -1.404 0.169     -1.739   -1.403     -1.075 -1.403   0

Random effects:
  Name    Model
    year IID model
   pop_id IID model

Model hyperparameters:
                      mean    sd 0.025quant 0.5quant 0.975quant  mode
Precision for year   10.86 2.719      6.395    10.57      17.03 10.03
Precision for pop_id  1.26 0.331      0.723     1.22       2.02  1.15

Deviance Information Criterion (DIC) ...............: 4552.01
Deviance Information Criterion (DIC, saturated) ....: 2685.27
Effective number of parameters .....................: -737.88

Watanabe-Akaike information criterion (WAIC) ...: 10992.89
Effective number of parameters .................: 2810.03

Marginal log-Likelihood:  -3209.39 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

We can plot the distribution of the fixed effect (altitude).

marginal.altitude <- m.poisson$marginals.fixed$altitude_s
ggplot(as.data.frame(marginal.altitude)) +
    geom_line(aes(x = x, y = y)) +
    labs(
        x = "Altitude effect size",
        y = "Probability"
    )

We can compute the highest posterior density (HDP) credible interval.

inla.hpdmarginal(0.97, m.poisson$marginals.fixed$altitude_s)
                 low      high
level:0.97 -1.774432 -1.037384

We can also sample from the posterior distribution.

m.poisson.samples <- inla.posterior.sample(100, m.poisson)

We represent the sampled effect size of altitude against the marginal distribution obtained before.

fun <- function(...) {
    altitude_s
}
altitude.sample <- inla.posterior.sample.eval(fun, m.poisson.samples)
tab <- data.frame(altitude = altitude.sample[1, ])
ggplot(tab, aes(x = altitude)) +
    geom_histogram(aes(y = after_stat(density)), bins = 18) +
    geom_line(data = as.data.frame(marginal.altitude), aes(x = x, y = y)) +
    labs(
        x = "Altitude effect size",
        y = "Density"
    )

Next, we want to simulate data from the model. This step is important as it allows us to see if our model makes sense or not. In particular, we identify where our model fails and improve it.

alt_vals <- seq(-1, 3, length.out = 100)
y.sample <- inla.posterior.sample.eval(
    fun = function(...) {
        mu <- exp(Intercept + altitude_s * alt_vals)
        abundance <- rpois(length(mu), mu)
    },
    samples = m.poisson.samples
)

library(bayestestR)
y.sum <- apply(y.sample, 1, function(x) {
    y.hdi <- hdi(x, ci = 0.89)
    c(mean = mean(x), hdi_low = y.hdi$CI_low, hdi_high = y.hdi$CI_high)
})
data.sim <- data.frame(
    altitude_s = alt_vals,
    t(y.sum)
)

ggplot(data.sim, aes(x = altitude_s, y = mean)) +
    # geom_point(data = data, aes(x = altitude_s, y = abundance), colour = "grey80") +
    geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
    geom_line(colour = "steelblue") +
    labs(
        x = "Scaled altitude",
        y = "Eel abundance"
    )

Next, we want to plot actual data against this prediction. Because we have not included spatial and temporal random effects while sampling the posterior, we are making predictions for an average site and year. Therefore, we want to average abundance data over site and year.

data.avg <- data |>
    group_by(altitude_s) |>
    summarise(
        abundance_avg = mean(abundance),
        abundance_var = var(abundance)
    )
data.avg
# A tibble: 32 × 3
   altitude_s[,1] abundance_avg abundance_var
            <dbl>         <dbl>         <dbl>
 1         -0.979         13.0           52.4
 2         -0.954         35.1          416. 
 3         -0.929         50.2         1692. 
 4         -0.904         30.3          235. 
 5         -0.830         43.1         2204. 
 6         -0.780         50.1         1004. 
 7         -0.705         37.6          350. 
 8         -0.680         42.4          624. 
 9         -0.655         26.6          332. 
10         -0.630          9.44          15.3
# ℹ 22 more rows
ggplot(data.sim, aes(x = altitude_s, y = mean)) +
    geom_point(data = data.avg, aes(x = altitude_s, y = abundance_avg), colour = "grey80") +
    geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
    geom_line(colour = "steelblue") +
    labs(
        x = "Scaled altitude",
        y = "Eel abundance"
    )

We see that our model capture the clear decreasing trend of eel abundances with altitude.

Let’s investigate overdispersion with a posterior predictive check.

y_pred_avg <- apply(y.sample, 1, mean)
y_pred_var <- apply(y.sample, 1, var)
pred_df <- data.frame(
    abundance_avg = y_pred_avg,
    abundance_var = y_pred_var
)
pred_df$type <- "Posterior predictive"
pred_df$altitude_s <- alt_vals
data.avg$type <- "Observed"
plot_df <- rbind(pred_df, data.avg)

ggplot(plot_df, aes(x = abundance_avg, y = abundance_var, colour = type)) +
    geom_point() +
    geom_abline(slope = 1, intercept = 1) +
    labs(
        x = "Observed mean abundance",
        y = "Observed variance in abundace"
    )

We see clearly that observed data is overdispersed (variance >> mean) compared to the data generated by our model. This suggest that the poisson distribution is not suited for this task.

4.1.2. Negative binomial model

To account for overdispersion, we will test the negative binomial distribution. Basically, we will repeat the previous section but with a model where poisson distribution is replaced with a negative binomial one.

m.nbinom <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "iid") +
        f(pop_id, model = "iid"),
    family = "nbinomial",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE, dic = TRUE)
)
summary(m.nbinom)
Time used:
    Pre = 0.858, Running = 0.569, Post = 0.0926, Total = 1.52 
Fixed effects:
              mean    sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept)  2.036 0.158      1.721    2.037      2.344  2.037   0
altitude_s  -1.412 0.164     -1.737   -1.412     -1.092 -1.412   0

Random effects:
  Name    Model
    year IID model
   pop_id IID model

Model hyperparameters:
                                                        mean    sd 0.025quant
size for the nbinomial observations (1/overdispersion)  2.18 0.185      1.837
Precision for year                                     18.59 8.694      7.426
Precision for pop_id                                    1.46 0.411      0.806
                                                       0.5quant 0.975quant
size for the nbinomial observations (1/overdispersion)     2.17       2.56
Precision for year                                        16.73      40.78
Precision for pop_id                                       1.41       2.41
                                                        mode
size for the nbinomial observations (1/overdispersion)  2.16
Precision for year                                     13.63
Precision for pop_id                                    1.31

Deviance Information Criterion (DIC) ...............: 3440.05
Deviance Information Criterion (DIC, saturated) ....: 622.11
Effective number of parameters .....................: 53.88

Watanabe-Akaike information criterion (WAIC) ...: 3443.63
Effective number of parameters .................: 50.77

Marginal log-Likelihood:  -1783.55 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

Let’s plot the model prediction against actual data.

m.nbinom.samples <- inla.posterior.sample(1000, m.nbinom)

alt_vals <- seq(-1, 3, length.out = 100)
y.sample <- inla.posterior.sample.eval(
    fun = function(...) {
        mu <- exp(Intercept + altitude_s * alt_vals)
        abundance <- rnbinom(length(mu), mu = mu, size = theta[1])
    },
    samples = m.nbinom.samples
)

y.sum <- apply(y.sample, 1, function(x) {
    y.hdi <- hdi(x, ci = 0.89)
    c(mean = mean(x), hdi_low = y.hdi$CI_low, hdi_high = y.hdi$CI_high)
})
data.sim <- data.frame(
    altitude_s = alt_vals,
    t(y.sum)
)

ggplot(data.sim, aes(x = altitude_s, y = mean)) +
    geom_point(data = data.avg, aes(x = altitude_s, y = abundance_avg), colour = "grey80") +
    geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
    geom_line(colour = "steelblue") +
    labs(
        x = "Scaled altitude",
        y = "Eel abundance"
    )

We see that the negative binomial distribution better account for the variance. We will stick to this distribution for now, although we could have tried other distributions. Notably, the ZIP-poisson could be useful when we have many 0s.

4.2. Modeling time

Let’s now focus on modelling the effect of time. To begin with, let’s inspect the raw data.

ggplot(data, aes(x = year, y = abundance, color = altitude_s)) +
    geom_jitter(width = 0.2) +
    stat_summary(fun = mean, geom = "line", colour = "grey", size = 1) +
    labs(x = "Year", ylab = "Eel abundance", color = "Altitude (scaled)")

The plain line indicates the trend of the mean abundance over all sites. This trend is decreasing, we expect to retrieve this trend in our temporal random effects.

4.2.1. IID random effect

We begin by assuming that year random effects are independent from one another. We expect this assumption to be wrong, but let’s start from there and then explore more realistic modeling choices.

m.iid <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "iid") +
        f(pop_id, model = "iid"),
    family = "nbinomial",
    data = data,
    control.compute = list(config = TRUE)
)
re.year <- m.iid$summary.random$year

ggplot(re.year, aes(x = ID, y = mean)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
    geom_ribbon(aes(ymin = `0.025quant`, ymax = `0.975quant`),
        alpha = 0.3, fill = "steelblue"
    ) +
    geom_line(color = "steelblue") +
    geom_point() +
    labs(
        x = "Year",
        y = "Year random effect",
    )

4.3. Modeling space

TODO